Visualize all Observations by State & Species

data(us.cities)

# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
  out <- us.cities %>% 
  filter(country.etc==s) %>%
  mutate(city = gsub(paste0(" ", s), "", name)) %>%
  arrange(-pop)
  if (s == "OR") {
    out <- out %>% 
      head() %>%
      filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
                           "Beaverton", "Springfield")))
  } else if (s == "CO") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
  } else if (s == "NC") {
    out <- out %>%
      head() %>%
      filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
  } else {
    out <- out %>% head()
  }
  out
})

# Load the map data
states <- map_data("state") %>% 
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))

# Load your data
data.files <- list.files("data/final", full.names = T)

df <- purrr::map_df(data.files, readRDS) 

caps.after.ws <- function(string) {
  gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}

# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
  st <- case_when(st.abbr == "CO" ~ "colorado",
                  st.abbr == "NC" ~ "north carolina",
                  st.abbr == "VT" ~ "vermont",
                  st.abbr == "OR" ~ "oregon",
                  T ~ "")
  
  title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec), 
                             "Observations, 2016-2019"))
  
  p <- ggplot(data = states %>% filter(region == st)) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "#989875", color = "black") +
    geom_point(data = df %>% filter(state == st.abbr & common.name == spec), 
               aes(x = lon, y = lat), 
               size=1, alpha=.5, fill = "red", shape=21) +
    geom_point(data = top.cities %>% filter(country.etc == st.abbr), 
               aes(x=long, y=lat),
               fill="gold", color="black", size=3.5, shape = 21) + 
    geom_text(data = top.cities %>% filter(country.etc == st.abbr), 
              aes(x=long, y=lat, label=city),
              color="white", hjust=case_when(st.abbr=="NC"~.2,
                                               st.abbr=="VT"~.65,
                                               T~.5),
              vjust=ifelse(st.abbr=="NC", -.65, 1.5),
              size=4) + 
    coord_map() +
    ggtitle(title) +
    theme_minimal() +
    theme(panel.background = element_blank(),
          axis.text = element_blank(),
          axis.title = element_blank(),
          axis.ticks = element_blank(),
          panel.grid = element_blank())

  data.table(
    state=st.abbr,
    species=spec,
    plot=list(p)
  )
}

spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
  rename(spec=Var1, st.abbr=Var2) 

# Create a list of plots
plots <- purrr::map2_df(spec.state$spec, 
                        spec.state$st.abbr, 
                        ~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Ruddy Duck"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Belted Kingfisher"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Wild Turkey plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Wild Turkey"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sharp-shinned Hawk"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Downy Woodpecker"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Cedar Waxwing"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sandhill Crane"]$plot, 
          list(nrow=2, ncol=2)))

# Plot Sanderling Plots
do.call(ggpubr::ggarrange, 
        c(plots[species == "Sanderling"]$plot, 
          list(nrow=2, ncol=2)))

Explore Explanatory Rasters

states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states

TODO: - terra::freq - terra::density - terra::layerCor

Principal Component Analysis

r.df <- map_df(states, function(s) {
  df <- r.list[[s]] %>% as.data.frame()
  names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
  df %>% setDT()
  df[, state := factor(s, levels=states)]
  df[apply(df, 1, function(.x) !any(is.na(.x)))]
}) 

# Custom function to process factor levels
clean.levels <- function(x) {
  # Remove non-alphanumeric characters and replace with underscores
  x <- gsub("[^a-zA-Z0-9]", "_", x)
  # Convert to lowercase
  x <- tolower(x)
  # Remove any leading or trailing underscores
  x <- gsub("^_|_$", "", x)
  x <- gsub("__", "_", x)
  x <- gsub("NLCD_Land_", "", x)
  return(x)
}

r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]

# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1, data = r.df[, .(NLCD_Land, state)])) %>%
  cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F]) 

names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))

# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
  as.data.frame() %>%
  filter(V1 == 1) %>%
  row.names()

if (length(uniq.1) >= 1) {
  df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}


pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")

res <- pca.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Factor Analysis for Mixed Data

famd.fit <- FAMD(r.df, graph=F)

ggpubr::ggarrange(plotlist=purrr::map(
  c("var", "quanti", "quali"), 
  ~plot.FAMD(famd.fit, choix=.x)),
  ncol=1, nrow=3)

res <- famd.fit$var$coord %>%
  as.data.frame() %>%
  mutate(var=as.factor(rownames(.))) %>%
  select(var, everything()) %>%
  as_tibble()
rownames(res) <- NULL
  
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
  geom_bar(stat = "identity", fill="darkblue") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
  theme_minimal()

p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
  geom_bar(stat = "identity", fill="darkred") +
  coord_flip() +  # Makes it a horizontal bar chart
  labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
  theme_minimal()

ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)

Pseudo-Absence Generation

First, get all of the grid-cell geometries (based on the resolution of the rasters) as spatial dataframes. Also extract each cell’s centroid, and row/column index from the original raster.

# Function to compute bounding box from centroid
compute.bbox <- function(x, y, half.res.x, half.res.y) {
  c(x - half.res.x, x + half.res.x, y - half.res.y, y + half.res.y)
}

# Function to generate a single POLYGON from the bounding box coordinates
make.polygon <- function(xmin, ymin, xmax, ymax) {
  m <- matrix(c(xmin, ymin, 
                xmax, ymin, 
                xmax, ymax, 
                xmin, ymax, 
                xmin, ymin), 
              ncol = 2, byrow = T)
  st_polygon(list(m))
}

get.grid.geoms <- function(r, .crs=NULL) {
  
  # Calculate the centroids of each cell
  centroids <- terra::xyFromCell(r, seq_len(ncell(r)))
  # Get resolution / 2 for x & y
  half.res.x <- res(r)[1] / 2
  half.res.y <- res(r)[2] / 2
  # Compute bounding box for each centroid
  bboxes <- t(apply(centroids, 1, function(pt) {
    compute.bbox(pt[1], pt[2], half.res.x, half.res.y)
  }))
  # Create dataframe
  dt <- as.data.table(bboxes)
  colnames(dt) <- c("xmin", "xmax", "ymin", "ymax")
  
  dt[, `:=` (
    # Add centroid lat/lon values to dataframe
    lon=centroids[, 1],
    lat=centroids[, 2],
    # Add i (row) and j (column) indices
    i=rowFromCell(r, 1:ncell(r)),
    j=colFromCell(r, 1:ncell(r))
  )]
  
  # Convert i and j to cell number
  dt[, cell := cellFromRowCol(r, i, j)]

  # Extract cell #'s of raster r where any values are NA
  r.na <- which(apply(as.data.frame(values(is.na(r))), MARGIN = 1, FUN = any))
  
  # Remove those cells from the data
  dt <- dt[-r.na]
  
  # Convert centroids to spatial points
  dt[, centroid := purrr::map2(lon, lat, ~st_point(cbind(.x, .y)))]
  
  # Create bounding box polygons for all rows and assign to geometry column
  dt[, bbox := purrr::pmap(.l=list(xmin, ymin, xmax, ymax), 
                               .f=make.polygon)]
  # Make spatial frame
  df <- st_sf(dt, 
              bbox = st_sfc(dt$bbox, crs=st_crs(r)),
              centroid = st_sfc(dt$centroid, crs=st_crs(r)))

  # Update CRS
  if (!is.null(.crs)) {
    df <- st_transform(df, st_crs(.crs)) %>%
      st_set_geometry("centroid") %>%
      st_transform(st_crs(.crs)) %>%
      st_set_geometry("bbox")
  }
  
  df %>% select(i, j, cell, bbox, centroid)
}

.grids <- purrr::map(r.list, ~get.grid.geoms(.x, .crs=4326))
names(.grids) <- states

# See sample grid dataframe
.grids$NC
# Combine grid-cell data into single data frame
grid.df <- purrr::map_df(states, ~mutate(.grids[[.x]], state=.x)) %>%
# Get cells surrounding each cell in the original grid
  mutate(i.j=paste(i, j, sep="_")) %>%
  mutate(
    # top-left
    tl.i = ifelse(paste(i - 1, j - 1, sep="_") %in% i.j, i - 1, NA),
    tl.j = ifelse(paste(i - 1, j - 1, sep="_") %in% i.j, j - 1, NA),
    # top
    t.i = ifelse(paste(i, j - 1, sep="_") %in% i.j, i, NA),
    t.j = ifelse(paste(i, j - 1, sep="_") %in% i.j, j - 1, NA),
    # top-right
    tr.i = ifelse(paste(i + 1, j - 1, sep="_") %in% i.j, i + 1, NA),
    tr.j = ifelse(paste(i + 1, j - 1, sep="_") %in% i.j, j - 1, NA),
    # right
    r.i = ifelse(paste(i + 1, j, sep="_") %in% i.j, i + 1, NA),
    r.j = ifelse(paste(i + 1, j, sep="_") %in% i.j, j, NA),
    # bottom-right
    br.i = ifelse(paste(i + 1, j + 1, sep="_") %in% i.j, i + 1, NA),
    br.j = ifelse(paste(i + 1, j + 1, sep="_") %in% i.j, j + 1, NA),
    # bottom
    b.i = ifelse(paste(i, j + 1, sep="_") %in% i.j, i, NA),
    b.j = ifelse(paste(i, j + 1, sep="_") %in% i.j, j + 1, NA),
    # bottom-left
    bl.i = ifelse(paste(i - 1, j + 1, sep="_") %in% i.j, i - 1, NA),
    bl.j = ifelse(paste(i - 1, j + 1, sep="_") %in% i.j, j + 1, NA),
    # left
    l.i = ifelse(paste(i - 1, j, sep="_") %in% i.j, i - 1, NA),
    l.j = ifelse(paste(i - 1, j, sep="_") %in% i.j, j, NA)
  ) %>% 
  select(-i.j)

# Get observation data
obs.df <- df %>%
  select(state, common.name, observation.point=geometry)

# Confirm matching CRS
obs.df <- st_transform(obs.df, st_crs(grid.df))

# Join the observation points to grid cells based on spatial intersection
joined.df <- st_join(obs.df, select(grid.df, -state), left = T)

# Extract all i and j values from the joined.df that intersected with obs.df
intersected.grid <- unique(joined.df[, c("i", "j", "common.name", "state")]) %>% 
  as.data.frame()

# Define the suffixes for each direction
directions <- c("tl", "t", "tr", "r", "br", "b", "bl", "l")

# Create a function to extract intersections for each direction
extract.intersections <- function(direction, joined.df) {
  i.col <- paste0(direction, ".i")
  j.col <- paste0(direction, ".j")
  
  joined.df %>% 
    filter(!is.na(.[[i.col]]) & !is.na(.[[j.col]])) %>%
    select(i = all_of(i.col), j = all_of(j.col), state, common.name) %>%
    unique() %>%
    as.data.frame()
}

# Use purrr::map to generate a list of dataframes
dilation.dfs <- map(directions, ~extract.intersections(.x, joined.df))
names(dilation.dfs) <- directions
dilation.dfs$original <- intersected.grid

intersected.grid.all <- do.call("rbind", dilation.dfs) %>%
  `rownames<-`(NULL) %>%
  unique()

# Get state/species combinations
state.spec <- expand.grid(unique(obs.df$common.name), states) %>%
  rename(common.name=Var1, state=Var2)

# Get all cells in grid that do NOT have an observation in them,
# by state and species
sampling.grid <- purrr::map_df(1:nrow(state.spec), function(.x) {
  .ss <- state.spec[.x, ]
  .ig <- filter(intersected.grid, 
                state == .ss$state & common.name == .ss$common.name)
  .g <- grid.df %>% filter(state == .ss$state)
  no.obs.grid <- anti_join(.g, .ig, by=c("i", "j")) %>%
    mutate(common.name = .ss$common.name)
})

# Get all cells in grid that do NOT have an observation in them,
# nor do the surrounding cells, by state and species
sampling.grid.with.dilation <- purrr::map_df(1:nrow(state.spec), function(.x) {
  .ss <- state.spec[.x, ]
  .ig <- filter(intersected.grid.all, 
                state == .ss$state & common.name == .ss$common.name)
  .g <- grid.df %>% filter(state == .ss$state)
  no.obs.grid <- anti_join(.g, .ig, by=c("i", "j")) %>%
    mutate(common.name = .ss$common.name)
})


# Display the non-intersected grid cells
sampling.grid %>%
  as.data.frame() %>%
  group_by(state, common.name) %>%
  summarize(`Total Cells Available`=n(), .groups="keep") %>%
  left_join(
    grid.df %>%
      as.data.frame() %>%
      group_by(state) %>%
      summarize(`Total Cells in State`=n(), .groups="keep"),
    by="state"
  ) %>%
  mutate(`% Available` = round(100 * `Total Cells Available` / `Total Cells in State`, 2)) %>%
  as_tibble()
# Display the non-intersected grid cells, with dilation
sampling.grid.with.dilation %>%
  as.data.frame() %>%
  group_by(state, common.name) %>%
  summarize(`Total Cells Available`=n(), .groups="keep") %>%
  left_join(
    grid.df %>%
      as.data.frame() %>%
      group_by(state) %>%
      summarize(`Total Cells in State`=n(), .groups="keep"),
    by="state"
  ) %>%
  mutate(`% Available` = round(100 * `Total Cells Available` / `Total Cells in State`, 2)) %>%
  as_tibble()
# Load the map data
states.df <- map_data("state") %>% 
  filter(region %in% c("oregon", "north carolina", "colorado", "vermont")) %>%
  st_as_sf(., coords = c("long", "lat"), crs = st_crs(sampling.grid), agr = "constant") %>%
  group_by(region) %>%
  summarize(do_union = F) %>% 
  st_cast("POLYGON") %>%
  mutate(state=case_when(
    region=="colorado"~"CO",
    region=="north carolina"~"NC",
    region=="oregon"~"OR",
    region=="vermont"~"VT",
    T~"")
  ) %>%
  select(-region)


# Plots
available.cells <- purrr::map(
  states,
  function(.state) {
    ggpubr::ggarrange(
      plotlist=purrr::map(
        sort(unique(sampling.grid$common.name)),
        function(spec) {
          d.spec <- sampling.grid %>%
            filter(state == .state & common.name == spec)
          d.state <- states.df %>% 
            filter(state == .state)
          ggplot() + 
            geom_sf(data=d.state, fill="white", color="black", size=0.5) +
            geom_sf(data=d.spec, fill="darkgreen", color="black", size=0.2) +
            theme_minimal() + 
            labs(title=paste0("Available sampling regions in\n",
                              .state, " for ", spec))
        }
      ), ncol=2,  nrow=4, align="hv", widths=c(1, 1.2))
    })

names(available.cells) <- states

# Colorado
available.cells$CO

# Available grid cells, NC
available.cells$NC

# Available grid cells, OR
available.cells$OR

# Available grid cells, VT
available.cells$VT

# TODO: Select pseudo-absence points

Train/Test Splitting

TODO: Include pseudo-absence data

stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
  # Cut along lat/lon values to create grids (lat.bin & lon.bin)
  # lat.lon.bins is the number of divisions you want
  df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
  df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
  
  # Create a new variable combining the stratification variables
  df %>%
    mutate(strata = paste(lat.bin, lon.bin, common.name, state)) %>%
    pull(strata) %>%
    # Create the data partitions
    createDataPartition(., p = p, list = F) %>%
    suppressWarnings()
}

prepare.data <- function(df, p=.7, lat.lon.bins=25) {
  train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
  df.train <- df[train.index, ]
  df.test <- df[-train.index, ]
  
  list(train = df.train, 
       test = df.test,
       index = train.index)
}

train.test <- prepare.data(df, .7)
train <- df[train.test$index,]
test <- df[-train.test$index,]

EDA With Pseudo-Absence Data

Autocorrelation Mitigation

Feature Engineering

Land Cover Hierarchical Updates to Categories

Each of the 20 different Land Cover Categories falls under a “parent” category (see National Land Cover Database Class Legend and Description).

Feature Selection